logo





Autism CRC Australian Eating Survey Data (ACRC + QTAB): Exploratory data analysis and PCs

1 Loading packages

library(rmarkdown)    # install.packages("rmarkdown") 
library(data.table)
library(epuRate)      # devtools::install_github("holtzy/epuRate", force=TRUE)
library(DT)
library(ggplot2)
library(plotly)
library(tidyverse)
library(magrittr) # mutate_at()
library(purrr) # map()
library(corrplot)
library(caret)
library(naniar) # replace_with_na_all()
library(seqinr) # swap()
library(reshape2) # melt()
library(ggfortify) # pca, but for this need quantitative data?
library(polycor) # for polychoric correlation
library(ggbiplot) # for pca
library(factoextra) # for pca
library(compositions) # for clr
library(car) # for 
library(vegan) # for

2 Loading data


Loading ACRC AES data and installing packages

3 Clean data

3.1 Prune down data file to relevant columns and individuals

3.1.1 ACRC

3.1.2 QTAB

3.1.3 Combine dataframes

acrc <- acrc %>% dplyr::select(-c("consent", "comments", "survey_id"))

qtab <- qtab %>% dplyr::select(-c("consent"))
qtab$participant_type <- "UNR_QTAB"

# Join dataframes (just use this function to get the columns in the right order essentially)
aes <- join(acrc, qtab, type = "full")

# Filter down to individuals used in analysis (ie. not all QTAB)
aes <- aes %>% filter(id %in% ids_analysis$Participant.ID)

3.2 Checks and cleans

  • N.B. QTAB date_started/date_finished incomplete. The ones that I do have are generally finished in July 2019 (ie. 08/07/2019). So have filled aes$date_finished with this.
# Rename participant_type
aes <- aes %>% mutate(participant_type=replace(participant_type, (participant_type %in% 1:2), "ASD"))
aes <- aes %>% mutate(participant_type=replace(participant_type, (participant_type == 3), "SIB"))
aes <- aes %>% mutate(participant_type=replace(participant_type, (participant_type == 4), "UNR_ACRC"))

# Create participant_group
aes$participant_group <- "NA"
aes <- aes %>% mutate(participant_group=replace(participant_group, (participant_type == "ASD"), "ASD"))
aes <- aes %>% mutate(participant_group=replace(participant_group, (participant_type == "SIB"), "SIB"))
aes <- aes %>% mutate(participant_group=replace(participant_group, (participant_type %in% c("UNR_ACRC", "UNR_QTAB")), "UNR"))

# Rename sex
# aes <- aes %>% mutate(sex=replace(sex, (sex == 1), "M"))
# aes <- aes %>% mutate(sex=replace(sex, (sex == 2), "F"))

# Check DOB
# Convert to standard format
aes$dob <- as.Date(aes$dob, format="%d/%m/%y")
aes$date_finished[which(aes$participant_type == "UNR_QTAB")] <- "8/7/19"
aes$date_finished <- as.Date(aes$date_finished, format="%d/%m/%y")
aes$age <- round(as.numeric(difftime(aes$date_finished, aes$dob, units = "days"))/365,2)
plot(aes$age)

min(aes$age[which(!is.na(aes$age))])
## [1] 2.07
max(aes$age[which(!is.na(aes$age))])
## [1] 17.62
# Check BMI is within limits
aes$height <- as.numeric(aes$height)
aes$weight <- as.numeric(aes$weight)
aes$bmi <- aes$weight/(aes$height/100)^2
plot(aes$bmi[which(!is.na(aes$bmi))])

min(aes$bmi[which(!is.na(aes$bmi))])
## [1] 8.115187
max(aes$bmi[which(!is.na(aes$bmi))])
## [1] 37.25738

3.3 Reorder levels of food frequency columns

  • exclude “milk” and “bread_type”: not as frequency variables (if you want to include, need to add back in to the below)

When running order_food_levels(), the following variables fail the check: - “milk”: Not in frequency levels –> exclude - “bread_type”: Not in frequency levels –> exclude - “grapes_etc”: [1] “1 - 3 per month” “1 per week” “2 - 4 per week” [4] “31” “5 or more per week” “Less than 1 per month” - There’s a stray “31”. Check other fields with similar levels (eg. mayonnaise, nuts) - unique(aes$nuts). Missing field is “Never”

3.3.1 Function

order_food_levels <- function(x) {

# Order the factor levels
# Rules:
# - Never = 0
# - month / week / day
    # - less
    # - numbers in order
    # - more

    tmp <- as.factor(x)
    tmp.levels <- levels(tmp) # Tests: order_food_levels(acrc$breakfast_cereal)
    
    never <- grep("Never", tmp.levels)
    month <- grep("month", tmp.levels)
    week <- grep("week", tmp.levels)
    day <- grep("day", tmp.levels)
    
    output = NULL
    
    if (length(never) == 1) {
        output <- tmp.levels[never]
    }
    
    if (length(month > 1)) {
        # Take month variables
        tmp.levels_month <- tmp.levels[month]

        # Take Less/More: immediately put at the start/end
        # Take levels within each time period bracket
        less <- grep("Less", tmp.levels_month)
        more <- grep("More", tmp.levels_month)
        if (length(less) > 0 | length(more > 0)) {
            tmp.levels_month2 <- tmp.levels_month[-c(less, more)]           
        } else {
            tmp.levels_month2 <- tmp.levels_month
        }
        # Take numeric factors and extract the numbers
        num <- as.numeric(gsub("([0-9]+).*$", "\\1", tmp.levels_month2))

        # Order the indices within the time category
        # Put the NA first (eg. "Once per week" is less frequent/goes before other numbers)
        tmp.levels_month2_ord <- tmp.levels_month2[order(num, na.last = FALSE)]
        monthlev <- c(tmp.levels_month[less], tmp.levels_month2_ord, tmp.levels_month[more])
        output <- c(output, monthlev)
    }   
    
    if (length(week > 1)) {
        # Take week variables
        tmp.levels_week <- tmp.levels[week]

        # Take Less/More: immediately put at the start/end
        # Take levels within each time period bracket
        less <- grep("Less", tmp.levels_week)
        more <- grep("More", tmp.levels_week)
        if (length(less) > 0 | length(more > 0)) {
            tmp.levels_week2 <- tmp.levels_week[-c(less, more)]         
        } else {
            tmp.levels_week2 <- tmp.levels_week
        }

        # Take numeric factors and extract the numbers
        num <- as.numeric(gsub("([0-9]+).*$", "\\1", tmp.levels_week2))

        # Order the indices within the time category
        # Put the NA first (eg. "Once per week" is less frequent/goes before other numbers)
        tmp.levels_week2_ord <- tmp.levels_week2[order(num, na.last = FALSE)]
        weeklev <- c(tmp.levels_week[less], tmp.levels_week2_ord, tmp.levels_week[more])
        output <- c(output, weeklev)
    }

    if (length(day > 1)) {
        # Take day variables
        tmp.levels_day <- tmp.levels[day]

        # Take Less/More: immediately put at the start/end
        # Take levels within each time period bracket
        less <- grep("Less", tmp.levels_day)
        more <- grep("More", tmp.levels_day)
        if (length(less) > 0 | length(more > 0)) {
            tmp.levels_day2 <- tmp.levels_day[-c(less, more)]           
        } else {
            tmp.levels_day2 <- tmp.levels_day
        }

        # Take numeric factors and extract the numbers
        num <- as.numeric(gsub("([0-9]+).*$", "\\1", tmp.levels_day2))

        # Order the indices within the time category
        # Put the NA first (eg. "Once per week" is less frequent/goes before other numbers)
        tmp.levels_day2_ord <- tmp.levels_day2[order(num, na.last = FALSE)]
        daylev <- c(tmp.levels_day[less], tmp.levels_day2_ord, tmp.levels_day[more])
        output <- c(output, daylev)
    }
    # Checks
    # - to capture all levels
    # - to show all unique
    print(c(length(output) == length(tmp.levels)) & length(unique(output)) == length(tmp.levels))
    
    return(output)

}

3.3.2 Re-format

food_col <- c("snacks","dairy","daily_softdrinks","diet_softdrink","softdrink","water_bottled_etc","juice","cordials","tea_coffee","beer","wine","spirits",
    "flavoured_milk","plain_milk","cream","icecream","frozen_yoghurt","yoghurt","cottage_cheese","cheese","cheese_spread",
    "muesli","porridge","breakfast_cereal","bread","muffin_etc","rice","other_grains","noodles","pasta",
    "cakes_etc","sweet_pastries","puddings","sweet_biscuits","cream_biscuits","savoury_biscuits","savoury_combinations","sweet_combinations",
    "snack_noodles","fruit_bars","snack_bars","muesli_bars",
    "mince","beef_lamb_wovege","beef_lamb_wvege","plain_meat_wovege","plain_meat_wvege",
    "chicken_wovege","chicken_wvege","crumbed_chicken","plain_chicken_wovege","plain_chicken_wvege",
    "pork_wovege","pork_wvege","plain_pork_wovege","plain_pork_wvege","liver",
    "crumbed_fish","fresh_fish","canned_fish","other_seafood",
    "creamy_soup","clear_soup",
    "tacos_etc","sausages_etc","hamburgers","pizza","pie_etc","hot_dog","savoury_pastries","hash_browns_etc",
    "twisties_etc","potato_crisps_etc",
    "creamy_iceblock","water_iceblock",
    "chocolate","lollies",
    "low_fat_dressing","mayonnaise",
    "nuts","jam_etc","peanut_butter_etc","vegemite_etc","tomato_sauce_etc","devon_etc","bacon_etc","eggs",
    "jelly","takeaway_fries","home_fires",
    "potato","pumpkin","sweet_potato","cauliflower","green_beans","spinach","cabbage_etc",
    "peas","broccoli","carrots","zucchini_etc","capsicum","corn_etc","mushrooms","tomatoes",
    "lettuce","celery_etc","avocado","onion_etc",
    "soybeans_etc","baked_beans","other_beans_etc",
    "canned_fruit","fruit_salad","dried_fruit",
    "apple_etc","orange_etc","banana","peach_etc","mango_etc","pineapple","grapes_etc","melon")

micro_col <- c("fibre","thiamin","riboflavin","niacinequiv","vitc","folate","vita","retinol","betacarotine","sodium","potassium","magnesium","calcium","phosphorus","iron","zinc")
macro_col <- c("protein","fats","satfats","polyfats","monofats","cholesterol","carbohydrate","sugars")

aes$grapes_etc[which(aes$grapes_etc == "31")] <- "Never"
# Get the factor level orders
aes_food_order <- sapply(aes[,food_col], order_food_levels)

# Order the values within the dataframe
for (i in 1:length(food_col)) {
    aes[,food_col[i]] <- ordered(aes[,food_col[i]], levels = aes_food_order[[i]])   
}

4 Basic demographics

# Cases vs sibling vs control
print("ASD vs Sibling vs Unrelated")
## [1] "ASD vs Sibling vs Unrelated"
table(aes$participant_type)
## 
##      ASD      SIB UNR_ACRC UNR_QTAB 
##       99       51       47       48
# Who completed survey
print("Who completed survey")
## [1] "Who completed survey"
table(aes$who_completedsurvey)
## 
##  a_parent caregiver     child  teenager 
##       232         3         7         3
# Age
ggplot(aes, aes(x = age, colour = participant_type, fill = participant_type)) + 
    geom_histogram(alpha = 0.1) +
    labs(legend.title = "Participant") +
    theme(legend.position = "bottom")

# Sex
tmp <- unique(aes$participant_type)
tmp.ls <- list()
for (i in 1:length(unique(tmp))) {
  tmp.ls[[i]] <- aes %>% filter(participant_type==tmp[i]) %>% dplyr::select("sex") %>% table()
}
foo <- do.call(rbind.data.frame, tmp.ls)
sex <- data.frame(Count=c(foo[,1], foo[,2]), 
                  Sex=c(rep("Female", length(tmp)), rep("Male", length(tmp))),
                  Group=rep(tmp,2))

# Plot
ggplot(sex, aes(x = Sex, y = Count, fill = Group)) + 
  geom_bar(stat = "identity", position = position_dodge())

5 Diversity in diet

diet_diversity <- data.frame(IID = aes$id, 
    participant_group = as.factor(aes$participant_group), 
    participant_type = as.factor(aes$participant_type), 
    age = aes$age, sex = aes$sex,
    shannon_micro = diversity(sapply(aes[,micro_col], as.numeric), "shannon"),
    shannon_macro = diversity(sapply(aes[,macro_col], as.numeric), "shannon"),
    shannon_food = diversity(sapply(aes[,food_col], function(x) as.numeric(as.factor(x))), "shannon"))

5.1 Plot

# Plot
diet_diversity.long <- melt(diet_diversity, 
    id.vars = c("IID", "participant_group", "participant_type", "age", "sex"),
    measure.vars = c("shannon_micro", "shannon_macro", "shannon_food"),
    variable.name = "diet_variable",
    value.name = "shannon")

ggplot(diet_diversity.long, 
    aes(x = participant_type, y = shannon, colour = participant_type)) + 
    geom_boxplot() +
    facet_grid(cols = vars(diet_variable))

# Focus on foods
ggplot(diet_diversity.long %>% filter(diet_variable == "shannon_food"), 
    aes(x = participant_type, y = shannon, colour = participant_type)) + 
    geom_boxplot() +
    facet_grid(cols = vars(diet_variable))

ggplot(diet_diversity.long %>% filter(diet_variable == "shannon_food"), 
    aes(x = participant_group, y = shannon, colour = participant_group)) + 
    geom_boxplot() +
    facet_grid(cols = vars(diet_variable))

5.2 Statistical tests

  • ANOVA (unadjusted + adjusted)
  • Levene’s test for difference in variance
  • DO see differences! In both means and variance
# ANOVA
# 1-way ANOVA

# Unadjusted for covariates
food.aov <- aov(shannon_food ~ participant_type, data = diet_diversity)
summary(food.aov)
##                   Df Sum Sq  Mean Sq F value  Pr(>F)    
## participant_type   3  0.021 0.006999    5.66 0.00092 ***
## Residuals        241  0.298 0.001237                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
food.aov <- aov(shannon_food ~ participant_group, data = diet_diversity)
summary(food.aov)
##                    Df  Sum Sq  Mean Sq F value   Pr(>F)    
## participant_group   2 0.02019 0.010095   8.175 0.000367 ***
## Residuals         242 0.29882 0.001235                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Adjusted
food_diversity <- diet_diversity %>% filter(!is.na(shannon_food))
food_diversity$shannon_food_resid <- residuals(lm(shannon_food ~ age + as.factor(sex), data = food_diversity))
food.aov <- aov(shannon_food_resid ~ participant_type, data = food_diversity)
summary(food.aov)
##                   Df  Sum Sq  Mean Sq F value   Pr(>F)    
## participant_type   3 0.02046 0.006820   5.832 0.000732 ***
## Residuals        241 0.28181 0.001169                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
food_diversity <- diet_diversity %>% filter(!is.na(shannon_food))
food_diversity$shannon_food_resid <- residuals(lm(shannon_food ~ age + as.factor(sex), data = food_diversity))
food.aov <- aov(shannon_food_resid ~ participant_group, data = food_diversity)
summary(food.aov)
##                    Df Sum Sq  Mean Sq F value  Pr(>F)   
## participant_group   2 0.0142 0.007102   5.966 0.00296 **
## Residuals         242 0.2881 0.001190                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Levene's test for difference in variance
leveneTest(shannon_food_resid ~ participant_group, data = food_diversity)

6 Metric 1: Macronutrients and general diet

  • eg. carbohydrates, protein, fats, water intake

6.1 Graph

# Subset macronutrients table
aes_macro <- aes %>% 
  dplyr::select(id,participant_group,age,sex,weight,height,energy,protein,fats,satfats,polyfats,monofats,cholesterol,carbohydrate,sugars,water)

aes_macro[,5:ncol(aes_macro)] <- sapply(aes_macro[,5:ncol(aes_macro)], as.numeric)

# Show the table
datatable(aes_macro %>% head(10) %>% select(-"id"), rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))
# Correlation plot
# library(corrplot)
correlations <- cor(aes_macro[,c(5,6,8:ncol(aes_macro))]) 
corrplot(correlations, order = "hclust", method = "circle")

# Distributions of continuous variables
# library(caret)
y <- as.factor(aes_macro$participant_group)
x <- aes_macro[,c(5,6,8:ncol(aes_macro))]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales, auto.key=list(columns=3))

6.2 PCA plot https://www.datacamp.com/community/tutorials/pca-analysis-r

aes_macro.pca <- prcomp(aes_macro[,c(8:ncol(aes_macro))], center = T, scale = T)
aes_macro.ind <- aes_macro$participant_group
ggbiplot::ggbiplot(aes_macro.pca, groups=aes_macro.ind)

7 Metric 2: Micronutrients

  • eg. vitamins and minerals

7.1 Graph

# Subset micronutrients table
aes_micro <- aes %>% 
  dplyr::select(id,participant_group,age,sex,weight,height,fibre,thiamin,riboflavin,niacin,niacinequiv,vitc,folate,vita,retinol,betacarotine,sodium,potassium,magnesium,calcium,phosphorus,iron,zinc)

aes_micro[,5:ncol(aes_micro)] <- sapply(aes_micro[,5:ncol(aes_micro)], as.numeric)

# Show the table
datatable(aes_micro %>% head(10) %>% select(-"id"), rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T) )
# Correlation plot
# library(corrplot)
correlations <- cor(aes_micro[,c(5,6,8:ncol(aes_micro))]) 
corrplot(correlations, order = "hclust", method = "circle")

# Distributions of continuous variables
# library(caret)
y <- as.factor(aes_micro$participant_group)
x <- aes_micro[,c(5,6,8:ncol(aes_micro))]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales, auto.key=list(columns=3))

featurePlot(x=x, y=y, plot="strip", scales=scales, auto.key=list(columns=3))

7.2 PCA plot

aes_micro.pca <- prcomp(aes_micro[,c(8:ncol(aes_micro))], center = T, scale = T)
aes_micro.ind <- aes_micro$participant_group
ggbiplot::ggbiplot(aes_micro.pca, groups=aes_micro.ind)

8 Metric 3: Proportion of energy

8.1 By macronutrient

ie. what proportion of energy is contributed to by each macronutrient

# Display data
aes_pe_macro <- aes[,c("id", "participant_group", "age", "peprotein", "pecarbohydrate", "pefats", "pesatfats", "pepolyfats", "pemonofats")]

aes_pe_macro[,3:ncol(aes_pe_macro)] <- sapply(aes_pe_macro[,3:ncol(aes_pe_macro)], as.numeric)

datatable(aes_pe_macro %>% head(10) %>% select(-"id"), rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T) )

8.1.1 Graph

# Box and whisker plot
aes_pe_macro.long <- melt(aes_pe_macro, id.vars=c("participant_group"), measure.vars=c("peprotein", "pecarbohydrate", "pefats", "pesatfats", "pepolyfats", "pemonofats"), variable.name="macronutrient")
colnames(aes_pe_macro.long) <- c("group", "macronutrient", "prop_energy")
aes_pe_macro.gg <- ggplot(aes_pe_macro.long, aes(x=macronutrient, y=prop_energy, colour=group)) +
  geom_boxplot()
aes_pe_macro.gg

8.2 By core (ie. healthy) vs. non-core (unhealthy) food

ie. what proportion of energy is contributed to by healthy foods (aka “core” foods)

8.2.1 Graph

aes_pe_core <- aes %>% dplyr::select(id, participant_group, age, pecore, penoncore)

aes_pe_core[,3:ncol(aes_pe_core)] <- sapply(aes_pe_core[,3:ncol(aes_pe_core)], as.numeric)

# Box and whisker plot
aes_pe_core.long <- melt(aes_pe_core, id.vars=c("participant_group"), measure.vars=c("pecore", "penoncore"), variable.name="core_noncore")
colnames(aes_pe_core.long) <- c("group", "core_noncore", "prop_energy")
aes_pe_core.gg <- ggplot(aes_pe_core.long, aes(x=core_noncore, y=prop_energy, colour=group)) +
  geom_boxplot()
aes_pe_core.gg

8.3 PCA plot

aes_pe_core.pca <- prcomp(aes_pe_core[,c(3:5)], center = T, scale = T)
aes_pe_core.ind <- aes_pe_core$participant_group
ggbiplot::ggbiplot(aes_pe_core.pca, groups=aes_pe_core.ind)

8.4 By food groups

  • Comparing groups with respect to what proportion of energy they gain from each food group

Can break down into:

  • Core foods: healthy eg. veg, fruit, meat, protein alternatives, grains, dairy
  • Non-core: unhealthy eg. sweet drink, packed snack, confectionery, baked products, takeaway, condiments, fatty meats

8.4.1 Graph

aes_pe_spec <- aes %>% dplyr::select(participant_group, age, peveg, pefruit, pemeat, pealt, pegrains, pedairy, pesweet_drink, pepack_snack, peconfect, pebaked_products, petakeaway, pecondiments, pefatty_meats)

aes_pe_spec[,3:ncol(aes_pe_spec)] <- sapply(aes_pe_spec[,3:ncol(aes_pe_spec)], as.numeric)

# Show the table
datatable(aes_pe_spec %>% head(10), rownames = FALSE, filter="top", options = list(pageLength = 10, scrollX=T))
# Correlation plot
# library(corrplot)
correlations <- cor(aes_pe_spec[,c(3:ncol(aes_pe_spec))]) 
corrplot(correlations, order = "hclust", method = "circle")

# Distributions of continuous variables
# library(caret)
y <- as.factor(aes_pe_spec$participant_group)
x <- aes_pe_spec[,c(4:ncol(aes_pe_spec))]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales, auto.key=list(columns=3))

# Box and whisker plot
#aes_pe_spec2 <- aes_pe_spec %>% select(-age) %>% select(-id)
aes_pe_spec.long <- melt(aes_pe_spec, id.vars=c("participant_group"), variable.name="specific_food")
colnames(aes_pe_spec.long) <- c("group", "specific_food", "prop_energy")
aes_pe_spec.gg <- ggplot(aes_pe_spec.long, aes(x=specific_food, y=prop_energy, colour=group)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
aes_pe_spec.gg

# Remove the ID column
aes_pe_spec <- aes_pe_spec %>% dplyr::select(-age)

8.5 PCA plot

aes_pe_spec.pca <- prcomp(aes_pe_spec[,c(3:ncol(aes_pe_spec))], center = T, scale = T)
aes_pe_spec.ind <- aes_pe_spec$participant_group
ggbiplot::ggbiplot(aes_pe_spec.pca, groups=aes_pe_spec.ind)

10 PC generation: CLR, % energy data, no covariate regression + add Bristol Stool Chart

  • As above, but apply clr transform to %energy data as it’s proportional

10.1 Centred-log ratio transformation

library(mixOmics)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:plotly':
## 
##     select
## 
## Loaded mixOmics 6.10.9
## Thank you for using mixOmics!
## Tutorials: http://mixomics.org
## Bookdown vignette: https://mixomicsteam.github.io/Bookdown
## Questions, issues: Follow the prompts at http://mixomics.org/contact-us
## Cite us:  citation('mixOmics')
## 
## Attaching package: 'mixOmics'
## The following objects are masked from 'package:caret':
## 
##     nearZeroVar, plsda, splsda
## The following object is masked from 'package:purrr':
## 
##     map
aes_1193652_pe <- aes_1193652 %>% dplyr::select(id, participant_group, month, age, sex, bmi, peveg, pefruit, pemeat, pealt, pegrains, pedairy, pesweet_drink, pepack_snack, peconfect, pebaked_products, petakeaway, pecondiments, pefatty_meats)

aes_1193652_pe[,7:ncol(aes_1193652_pe)] <- sapply(aes_1193652_pe[,7:ncol(aes_1193652_pe)], as.numeric)

aes_1193652_pe.clr.tmp <- logratio.transfo(as.matrix(aes_1193652_pe[,7:ncol(aes_1193652_pe)]), logratio = "CLR", offset = 0.001)
class(aes_1193652_pe.clr.tmp) <- "matrix"
#aes_1193652_pe.clr.tmp <- clr(as.matrix(aes_1193652_pe[,7:ncol(aes_1193652_pe)]))
aes_1193652_pe.clr <- data.frame(aes_1193652_pe[,1:5], aes_1193652_pe.clr.tmp)

# Distributions of continuous variables
y <- as.factor(aes_1193652_pe.clr$participant_group)
x <- aes_1193652_pe.clr[,c(6:ncol(aes_1193652_pe.clr))]
scales <- list(x=list(relation="free"), y=list(relation="free"))
featurePlot(x=x, y=y, plot="density", scales=scales, auto.key=list(columns=3))

10.2 Generate PCs

aes_1193652_pe.clr.pca <- prcomp(aes_1193652_pe.clr[,c(6:ncol(aes_1193652_pe.clr))], center = T, scale = T)
aes_1193652_pe.clr.grp <- aes_1193652_pe.clr$participant_group
aes_1193652_pe.clr.ind <- aes_1193652_pe.clr$id
aes_1193652_pe.clr.age <- as.factor(round(aes_1193652_pe.clr$age, 0))

# Generate plots
ggbiplot::ggbiplot(aes_1193652_pe.clr.pca, groups=aes_1193652_pe.clr.grp, labels=aes_1193652_pe.clr.ind, ellipse = TRUE)

10.3 Check PC interpretability

# PC loadings per questionnaire
aes_1193652_pe.clr.lod <- aes_1193652_pe.clr.pca$rotation
datatable(aes_1193652_pe.clr.lod) %>% formatRound("PC1", 3) %>% formatRound("PC2", 3) %>% formatRound("PC3", 3) %>% formatRound("PC4", 3) %>% formatRound("PC5", 3) %>% formatRound("PC6", 3) %>% formatRound("PC7", 3) %>% formatRound("PC8", 3) %>% formatRound("PC9", 3) %>% formatRound("PC10", 3) %>% formatRound("PC11", 3) %>% formatRound("PC12", 3) %>% formatRound("PC13", 3) 
aes_1193652_pe.clr.lod.long <- melt(aes_1193652_pe.clr.lod)
colnames(aes_1193652_pe.clr.lod.long) <- c("Food_group", "PC", "Loading")
ggplot(data = aes_1193652_pe.clr.lod.long, aes(x = PC, y = Loading, fill = Food_group)) +
  geom_bar(stat="identity", position=position_dodge())

aes_1193652_pe.clr.lod.long1to5 <- aes_1193652_pe.clr.lod.long %>% filter(PC %in% c("PC1", "PC2", "PC3", "PC4", "PC5"))
ggplot(data = aes_1193652_pe.clr.lod.long1to5, aes(x = PC, y = Loading, fill = Food_group)) + 
  geom_bar(stat="identity", position=position_dodge())

10.4 Check variance explained per PC

fviz_eig(aes_1193652_pe.clr.pca)

print(summary(aes_1193652_pe.clr.pca))
## Importance of components:
##                           PC1    PC2    PC3     PC4     PC5     PC6     PC7
## Standard deviation     1.4448 1.3865 1.2325 1.11562 1.09232 1.05038 0.94282
## Proportion of Variance 0.1606 0.1479 0.1169 0.09574 0.09178 0.08487 0.06838
## Cumulative Proportion  0.1606 0.3085 0.4253 0.52105 0.61283 0.69770 0.76607
##                            PC8     PC9    PC10    PC11    PC12      PC13
## Standard deviation     0.87328 0.86900 0.78922 0.73301 0.60254 5.208e-16
## Proportion of Variance 0.05866 0.05809 0.04791 0.04133 0.02793 0.000e+00
## Cumulative Proportion  0.82474 0.88283 0.93074 0.97207 1.00000 1.000e+00

10.4.1 Compare PCs by group

aes_1193652_pe.clr.pcs <- aes_1193652_pe.clr.pca$x
aes_1193652_pe.clr.pcs <- data.frame(id = aes_1193652_pe.clr.ind, participant_group = aes_1193652_pe.clr$participant_group, aes_1193652_pe.clr.pcs)

aes_1193652_pe.clr.pcs %>% 
  group_by(participant_group) %>%
  dplyr::summarise(mean_PC1 = round(mean(PC1),2), sd_PC1 = round(sd(PC1),2),
                   mean_PC2 = round(mean(PC2),2), sd_PC2 = round(sd(PC2),2),
                   mean_PC3 = round(mean(PC3),2), sd_PC3 = round(sd(PC3),2))
## `summarise()` ungrouping output (override with `.groups` argument)
# Check for differences between UNR and UNR_QTAB groups
aes_1193652_pe.clr.pcs2 <- aes_1193652_pe.clr.pcs
aes_1193652_pe.clr.pcs2$participant_group <- as.character(aes_1193652_pe.clr.pcs2$participant_group)
aes_1193652_pe.clr.pcs2$id <- as.character(aes_1193652_pe.clr.pcs2$id)
aes_1193652_pe.clr.pcs2[which(nchar(aes_1193652_pe.clr.pcs2$id) == 6),"participant_group"] <- "UNR_QTAB"

aes_1193652_pe.clr.pcs2 %>% 
  group_by(participant_group) %>%
  dplyr::summarise(mean_PC1 = round(mean(PC1),2), sd_PC1 = round(sd(PC1),2),
                   mean_PC2 = round(mean(PC2),2), sd_PC2 = round(sd(PC2),2),
                   mean_PC3 = round(mean(PC3),2), sd_PC3 = round(sd(PC3),2))
## `summarise()` ungrouping output (override with `.groups` argument)



 




A work by by Chloe Yap - 13 June 2021